**01/28=9/2022
**file to bring in CPD FOIA attendance and assignment files & assign unique IDs

**Housekeepig
local ver_suff _9_21
clear all
if "`c(username)'"=="taehokim" |"`c(username)'"=="Taeho Kim"{ 
	cd "/Users/`c(username)'/Dropbox (Penn Law)/misconduct/do_files`ver_suff'"
} 
else{
	cd "/Users/`c(username)'/Dropbox/misconduct/do_files`ver_suff'"
}
set more off

//later pass through profiles dataset to fuzzy match 
local profiles_match `1'

 

**Append all datasets together, drop duplicates again, and begin cleaning
use ../processed_data`ver_suff'/attendance/assign_dup1, clear

forval k = 2/26{
append using ../processed_data`ver_suff'/attendance/assign_dup`k'
}

duplicates drop officername race sex yearofbirth appointed_date, force

gen idmaster = _n 
destring yearofbirth, replace

************************
*GET FIRST, MIDDLE, LAST NAMES
************************
drop if officername == ""
split officername, gen(name)
local max_num = r(nvars)
 
gen first = name1  


//Middle name initial. For Irish last name starting with O, that O is not the middle initial. Correct for this

gen middle_initial = name2 if length(name2) == 1  

preserve
clear
use ../processed_data`ver_suff'/FOIA_processing/profiles_pid

keep if substr(lname, 1,2) == "O "
count
levelsof lname
local irish = r(levels)

restore

gen name_23 = name2 + " " + name3
foreach n of local irish{
replace middle_initial = "" if name_23 == "`n'"
}

//suffix
gen num_names = `max_num'
forval i = 2/`max_num'{
replace num_names = num_names -1 if name`i' == ""
}

gen suffix = ""
forval i = 2/`max_num'{
replace suffix = name`i' if num_names == `i' & (inlist(name`i', "JR", "SR", "JR." "SR.", "I", "II", "III")|inlist(name`i', "IV", "V", "VI","VII", "VIII"))
replace name`i' = "" if num_names == `i' & (inlist(name`i', "JR", "SR", "JR." "SR.", "I", "II", "III")|inlist(name`i', "IV", "V", "VI","VII", "VIII")) // remove name that will be used for last name later
}

//last name: will concatenate all names excepts for the first
replace name2 = "" if name2 == middle_initial //exclude the component captured by middle initial

gen last = name2 
forval k = 3/`max_num'{
replace last = last+" " +name`k' if `k'<=num_names & name`k' !=""
}

keep officername race sex yearofbirth appointed_date idmaster first middle_initial suffix last

rename first fname
rename last last_name
rename yearofbirth birthyear 
  
/*
gen gender = "MALE" if sex == "M"
replace gender = "FEMALE" if sex == "F"
drop sex
rename race race_field
*/

gen lname = last_name
replace lname = last_name +" " + suffix if suffix!=""
rename middle_initial mi

replace fname = trim(fname)
replace lname = trim(lname)
replace mi = trim(mi)

gen appoint_yr = year(appointed_date)
gen appoint_m = month(appointed_date)
gen appoint_d = day(appointed_date)

//some obs have variations of last names: deflate last names
gen lname_mod = subinstr(lname, "-", "", .)
replace lname_mod = subinstr(lname_mod, " ", "", .)
replace lname_mod = subinstr(lname_mod, ".", "", .)


gen fname_sdx = soundex(fname)
gen lname_sdx = soundex(lname)

save ../processed_data`ver_suff'/FOIA_processing/AA.dta, replace

gen pid = _n

save ../processed_data`ver_suff'/FOIA_processing/AA_pid.dta, replace
 
*******************
*SET UP LIST OF DATA TO STORE MATCHED IDS
*******************
clear 

set obs 1
gen u_pid = -1
gen pid = -1

save ../processed_data`ver_suff'/FOIA_processing/pid_appending, replace
clear


*******************
*FUZZY MATCHING
*******************

use ../processed_data`ver_suff'/FOIA_processing/AA_pid.dta, clear // start with 21,211

**Perfect name, appoint date, birth year  match
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 1 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 1 //16,390 so far

**Perfect name, appoint date, and birth year match, no middle initial
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 2  //20992

**Perfect first name, soundex last name, appoint date, and   no middle initial
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 1 0 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 3

**Soundex both names, middle initial, and appoint date, match
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 1 1 1 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 4  

**Soundex both names, no middle initial, and appoint date,  match
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 1 1 0 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 5  

**first name, no middle initial, no last name and appoint date match
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 0 0 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 6  

**first name, no middle initial, no last name and appoint date  match
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 0 0 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 7

**first name, last name, off in appoint_yr 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0 1 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 9  


**first name, last name, off in appoint_mo 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0 2 1 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 10  


**first name, last name, off in appoint_day 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0 2 2 1 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 11  

**off birth year

do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0 2 2 2 1 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 12  


**first name, last name, no other match 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0  3 3 3 3 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 13  
 
**soundex first name, soundex last name, no other match 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 1 1 0  3 3 3 3 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 14  
   
 
 
*******************
*KEEP BEST MATCHES
******************* 

use ../processed_data`ver_suff'/FOIA_processing/pid_appending.dta, clear

//drop the less good matches if individuals in awards data are matched with more than one in profiles

sort pid match_no
quietly by pid:  gen dup = cond(_N==1,0,_n)
sort dup
 

drop if dup>1 
drop dup 
sort pid match_no

tempfile best_matches
save `best_matches'

*******************
*MERGE DE-DUPLICATED ARREST DATA WITH MATCHINGS
******************* 

use ../processed_data`ver_suff'/FOIA_processing/AA_pid.dta, clear

merge 1:1 pid using `best_matches', keep(1 3) //  21,044 MATCHED AMONG 21,211

 
 



 
